Preprocessing

First, we get the dimension of the dataset.

# Read the raw data set; the first column holds the sample names, which are
# moved onto the row names of X.
XX <- read.csv("july2017.csv", header = T) 
X <- XX[,-1] # first column is the sample name
rownames(X) <- XX[,1]
dim(X)
## [1] 3353  313

Deleting observations

A fraction 0.1340624 (about 13.4%) of all values are NA.

Investigate any structure of missing observations.

# Inspect the structure of missing observations with mice::md.pattern();
# the last row of the pattern matrix gives the NA count per variable.
library(mice)
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
pattern <- md.pattern(X)
pattern <- pattern[,-ncol(pattern)] # drop the last column (its name is ""), not needed
pattern_last <- pattern[nrow(pattern),]
pattern_last_df <- data.frame(variable_name=names(pattern_last),num_missing=pattern_last)
DT::datatable(pattern_last_df,rownames = FALSE)

Omit the variables that contain more than 2184 NAs and the samples that contain more than 50 NAs.

 names <- names(which(pattern_last>2184))
X.1 <- X[,!(colnames(X) %in% names)]
rs <- apply(X.1,1,function(y) sum(is.na(y)))
X.2 <- X.1[-which(rs>50),]
row_name <- XX[,1]
row_name <- row_name[-which(rs>50)]

A fraction 0.0231252 (about 2.3%) of all values are NA now.

Filling in missing values &amp; sorting variables into factor/numeric types

# Most frequent value in `v`; ties are broken in favour of the value that
# appears first in the vector.
getmode <- function(v) {
  vals <- unique(v)
  counts <- tabulate(match(v, vals))
  vals[which.max(counts)]
}
# Fill in NA values & determine factor variables.
# A column with at most 6 distinct non-missing values is treated as a factor.
# NOTE: the original `unique(y, na.rm = TRUE)` silently ignored `na.rm`
# (unique() has no such argument), so NA counted as an extra distinct value;
# NAs are now excluded before counting. vapply() over the data frame also
# avoids apply()'s coercion of the whole frame to a character matrix.
lu <- vapply(X.2, function(y) length(unique(y[!is.na(y)])), integer(1))
for (i in which(lu <= 6)) {
  X.2[is.na(X.2[, i]), i] <- getmode(X.2[, i])  # impute with the mode
  X.2[, i] <- factor(X.2[, i])
}

# Fill in NA values & determine numerical variables (more than 6 distinct values).
for (i in which(lu > 6)) {
  # Convert via as.character() first: calling as.numeric() directly on a
  # factor column returns the internal level codes, not the recorded values.
  # (For columns that are already numeric, the round-trip is a no-op.)
  X.2[, i] <- as.numeric(as.character(X.2[, i]))
  temp_median <- median(X.2[, i], na.rm = TRUE)
  X.2[is.na(X.2[, i]), i] <- temp_median  # impute with the median
}

# Delete variables that are constant across all samples (one unique value).
X.3 <- X.2[, !(colnames(X.2) %in% names(which(lu == 1)))]
rownames(X.3) <- row_name
# Move column 146 to the front — presumably the outcome variable.
# TODO(review): confirm, and select it by name rather than by position.
X.3 <- X.3[, c(146, 1:145, 147:270)]
DT::datatable(X.3)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

lasso for mi (heart attack)

Choose mi: Heart attack (self-reported history) as the output, other variables except

  • CVD: Cardiovascular disease
  • db: Diabetes (self-reported history)
  • ht: Hypertension (self-reported history)
  • st: Stroke (self-reported history)

as the input.

# NOTE(review): rm(list = ls()) wipes the whole workspace; this assumes the
# preprocessed data were saved to normalized.Rdata beforehand.
rm(list=ls())
load("./normalized.Rdata")
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
# Response: mi (heart attack). The other self-reported disease-history
# variables (st, ht, db, CVD) are excluded from the predictors.
names.wanted <- c("mi","st","ht","db","CVD")
y=data[,"mi"]
x=data[,!(colnames(data) %in% names.wanted)]
# one-hot encode the factor columns into dummy variables
xd <- model.matrix(~.,x)[,-1] # drop the first column (the intercept); glmnet adds its own intercept by default

Train & test

The distribution of output:

# Hold out a random 10% of the samples as the test set.
set.seed(1)
idx<-sample(nrow(xd),1/10*nrow(xd))
trainx<-xd[-idx,]
testx<-xd[idx,]
trainy<-y[-idx]
testy<-y[idx]
rm(xd)
cat("distribution of training output:")
## distribution of training output:
summary(trainy)
##    1    2 
##  110 2839
cat("distribution of test output:")
## distribution of test output:
summary(testy)
##   1   2 
##  14 313

One hot encoding of input. Take a look at the test input:

DT::datatable(testx)

Dimension of training input:(2949, 339); dimension of test input:(327, 339); length of training output:(2949); length of test output:(327).

coefficient path of lambda

\(L=-\frac{1}{n}loglik+\lambda||\beta||_1\)

# L1-penalised logistic regression over a 50-value lambda path
# (alpha = 1 is the pure lasso penalty); plot the coefficient paths
# against log(lambda).
fit <- glmnet(
  x = trainx,
  y = trainy,
  family = "binomial",
  alpha = 1,
  nlambda = 50
)
plot(fit, xvar = "lambda", label = TRUE)

best lambda through CV: dev

The y-axis is the average of 10 dev on the leave out data.

\(dev=ResDev/N\), where \(ResDev=-2*loglik\).

# Results table for the plain lasso: one row per lambda-selection criterion.
lbdl <- data.frame(matrix(0, nrow = 4, ncol = 6))
rownames(lbdl) <- c("CV_deviance","CV_misclassification_rate","BIC","EBIC")
colnames(lbdl) <- c("lambda","num_non_zero","CCR_train","HUM_train","CCR_test","HUM_test")

library(doParallel)
## Warning: package 'doParallel' was built under R version 3.4.3
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(3)
registerDoParallel(cl)

# 10-fold CV for lambda, scored by binomial deviance.
cvfit <- cv.glmnet(trainx, trainy, family = "binomial", type.measure = "deviance",alpha=1,nfolds=10,parallel=TRUE,standardize = TRUE)
# stopImplicitCluster() has no effect on a cluster created explicitly with
# makeCluster(); stopCluster() is required, otherwise the worker connections
# leak (cf. the "closing unused connection" warnings later in this document).
stopCluster(cl)
plot(cvfit)

The best lambda is 0.0040135.

Coefs

# Record the CV(deviance)-optimal lambda and extract its coefficients.
lbdl[1,1] <- cvfit$lambda.min
cosp <- coef(cvfit, s=c(cvfit$lambda.min))
# summary() of a sparse matrix lists the row indices "i" of the stored
# (non-zero) entries; keep only those rows of the dense coefficient vector.
co_l_cv_mce <- as.matrix(cosp)[(1:cosp@Dim[1]) %in% summary(cosp)[,"i"],]
lbdl[1,2] <- length(co_l_cv_mce)
co_l_cv_mce
##       (Intercept)           gender2            agegp3           agegp23 
##     -5.665137e+00      7.728798e-01     -9.014958e-02     -2.553986e-05 
##           agegp25           bppul_f        anti_chol1     drugs_others1 
##     -7.116925e-01      5.036103e-03     -3.381137e-01     -7.593035e-01 
##     hypertension1    hypertension32 hyperlipidaemia11       blood_data1 
##     -6.863052e-01     -1.857029e-01     -7.158593e-01      5.628131e-01 
##            bldglu              chol           GFR_EPI   better_eye_bvaR 
##     -1.759280e-02      5.744405e-01      1.191050e-02      1.766079e-01 
##      pvalogr_USA2      bvalogr_USA2      bvalogl_USA3      bvalogl_WHO2 
##     -9.518539e-02     -1.007607e-01      1.434309e-01     -3.206571e-01 
##      undercorr_L1       bva_bi_USA4          srepsphl          srepcylr 
##     -1.453349e-02     -7.170429e-02      4.296226e-03      1.351580e-01 
##          srepcyll    anisometropia1      RE_type_pva2   highmyopia_pva1 
##     -1.033051e-01     -2.285010e-01      1.782176e-01     -1.547580e-01 
##   emmetropia_pva1         lenstatr4          rel_cat4         job_cat32 
##      1.086332e-05     -2.985753e-01      1.910830e-02     -3.403103e-02 
##             home4            smkyn2          smk_cat3              ang2 
##     -6.574953e-02      5.436503e-01     -1.613196e-01      2.266561e+00 
##           htchol2             avccr         IVAN_CRAE          IVAN_AVR 
##      7.751236e-02      1.966593e-01      2.058212e-03      5.257568e-01 
##    L_AMDgradable1  any_AMDgradable1        pigment_R1      R_early_amd1 
##     -5.697015e-01      2.240904e+00     -1.847695e-01     -2.628506e-02 
##     geoatrophy_L1   L_late_amd_dry1 any_late_amd_dry1     any_late_amd1 
##     -6.583244e-01      1.363367e-02      4.012796e-02      8.283072e-01 
##        amdtype_R1      amdtype_any2   any_DRgradable1     R_retino_cat2 
##     -1.670538e-02      1.205952e-04      1.324717e-02     -1.047815e+00 
##      R_VT_retino1        L_NDR_cat2           R_CSME1         CSME_any1 
##     -3.293194e-02     -1.975072e-01     -6.056883e-01     -6.042936e-01 
##       sec_glau_R1           POAG_L1  CataractGrading1        nuclear_L1 
##     -2.588122e-01     -4.180975e-01     -6.387850e-01     -2.402566e-02 
##        rcrtgp2_R3         rpscar2_R        rpscgp2_R2 
##      1.695987e-01     -1.353650e-02     -2.393507e-01

There are 63 non-zero variables.

Accuracy

library(mcca)
# Accuracy of the CV(deviance)-selected model: correct classification rate
# (ccr) and area under the ROC curve (hum), on training and test data.
pvtrain1 <- predict(cvfit,newx=trainx,type="response",s=c(cvfit$lambda.min))
lbdl[1,3] <- ccr(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
lbdl[1,4] <- hum(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
cat("Correct classification rate of training data:",lbdl[1,3])
## Correct classification rate of training data: 0.9677857
cat("Area under curve of training data:",lbdl[1,4])
## Area under curve of training data: 0.9367191
pvtest1 <- predict(cvfit,newx=testx,type="response",s=c(cvfit$lambda.min))
lbdl[1,5] <- ccr(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
lbdl[1,6] <- hum(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
cat("Correct classification rate of test data:",lbdl[1,5])
## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbdl[1,6])
## Area under curve of test data: 0.8455043

best lambda through CV: ME

cl <- makeCluster(3)
registerDoParallel(cl)

# 10-fold CV for lambda, scored by misclassification error.
cvfit <- cv.glmnet(trainx, trainy, family = "binomial", type.measure = "class",alpha=1,nfolds=10,parallel=TRUE,standardize = TRUE)
# stopCluster(), not stopImplicitCluster(), is needed for an explicit cluster.
stopCluster(cl)
plot(cvfit)

The best lambda is 0.0111678.

Coefs

# Record the CV(misclassification)-optimal lambda and extract its coefficients.
lbdl[2,1] <- cvfit$lambda.min
cosp <- coef(cvfit, s=c(cvfit$lambda.min))
# Keep only the stored (non-zero) entries of the sparse coefficient vector.
co_l_cv_mce <- as.matrix(cosp)[(1:cosp@Dim[1]) %in% summary(cosp)[,"i"],]
lbdl[2,2] <- length(co_l_cv_mce)
co_l_cv_mce
##   (Intercept)       gender2       agegp25      anti_ht1    anti_chol1 
##   -0.87184018    0.32394937   -0.23154342   -0.19487611   -0.97372931 
## drugs_others1 hypertension1          chol       GFR_EPI  bvalogr_USA2 
##   -0.48861967   -0.07457401    0.35259987    0.01144681   -0.16002465 
##        smkyn2      smk_cat3          ang2 R_retino_cat2 
##    0.32563476   -0.19493706    1.93992455   -0.47197565

There are 14 non-zero variables.

Accuracy

library(mcca)
# Accuracy of the CV(misclassification)-selected model on train and test data.
pvtrain1 <- predict(cvfit,newx=trainx,type="response",s=c(cvfit$lambda.min))
lbdl[2,3] <- ccr(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
lbdl[2,4] <- hum(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
cat("Correct classification rate of training data:",lbdl[2,3])
## Correct classification rate of training data: 0.9637165
cat("Area under curve of training data:",lbdl[2,4])
## Area under curve of training data: 0.9105655
pvtest1 <- predict(cvfit,newx=testx,type="response",s=c(cvfit$lambda.min))
lbdl[2,5] <- ccr(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
lbdl[2,6] <- hum(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
cat("Correct classification rate of test data:",lbdl[2,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[2,6])
## Area under curve of test data: 0.8523505

Best lambda through BIC

\(BIC=-2loglik+df*log(n)\)

library(grpreg)
## Warning: package 'grpreg' was built under R version 3.4.3
# Lasso via grpreg with one group per column (equivalent to the plain lasso),
# so that BIC/EBIC model selection via select() is available.
fit<- grpreg(X=trainx, y=trainy, penalty="grLasso",family = "binomial",group=1:ncol(trainx))
## [1] "Logistic regression modeling Pr(y=2)"
## Warning: 关闭不再使用的链结7(<-Gao-PC:11314)
## Warning: 关闭不再使用的链结6(<-Gao-PC:11314)
## Warning: 关闭不再使用的链结5(<-Gao-PC:11314)
Lambda <- fit$lambda
xlim <- range(Lambda)
# Run the BIC selection once and reuse it for both the plot and the chosen
# lambda (the original called select() twice, once with the partially
# matched `crit=` and once with `criterion=`).
bic_sel <- select(fit, criterion = "BIC")
plot(Lambda, bic_sel$IC, xlim=xlim, pch=19, type="o", ylab="BIC", col="red")
lbdl[3,1] <- bic_sel$lambda
abline(v=lbdl[3,1],lwd=3)

Coefs

The best lambda is 0.0101757.

# Non-zero coefficients at the BIC-selected lambda.
cosp <- select(fit,criterion = "BIC")$beta
cosp[which(cosp!=0)]
##   (Intercept)       gender2       agegp25      anti_ht1    anti_chol1 
##   -0.89576230    0.37922805   -0.28631894   -0.18955312   -0.98535431 
## drugs_others1 hypertension1          chol       GFR_EPI  bvalogr_USA2 
##   -0.52309536   -0.13673843    0.36921762    0.01122648   -0.18884284 
##      srepcylr        smkyn2      smk_cat3          ang2 R_retino_cat2 
##    0.01703296    0.34831108   -0.19912430    1.97564511   -0.56212888
lbdl[3,2] <- length(cosp[which(cosp!=0)])

There are 15 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the BIC-selected model on training and test data; mcca needs
# a numeric class label, hence as.numeric() on the factor outcome.
pre <- predict(fit, X=trainx, type = "response",lambda = lbdl[3,1])
pre <- data.matrix(pre)
lbdl[3,3] <- ccr(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[3,4] <- hum(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)

pre <- predict(fit, X=testx, type = "response",lambda = lbdl[3,1])
pre <- data.matrix(pre)
lbdl[3,5] <- ccr(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[3,6] <- hum(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbdl[3,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[3,6])
## Area under curve of test data: 0.8505249

Best lambda through EBIC

\(EBIC=-2loglik+df*log(n)+df*2*0.5*log(P)\)

library(grpreg)
Lambda <- fit$lambda
xlim <- range(Lambda)
# EBIC = BIC + 2*gamma*df*log(P) with gamma = 0.5; df = number of active variables.
bic <- select(fit, criterion = "BIC", df.method = "active")$IC
ebic <- bic + 2*0.5*log(dim(trainx)[2])*predict(fit, type="nvars")
plot(Lambda, ebic, xlim=xlim, pch=19, type="o", ylab="EBIC", col="red")
# which.min() ignores NAs and returns a single (first) index, whereas
# which(ebic == min(ebic, na.rm = TRUE)) relies on exact floating-point
# equality and can return several indices on ties.
lbdl[4,1] <- Lambda[which.min(ebic)]
abline(v=lbdl[4,1],lwd=3)

Coefs

The best lambda is 0.0162026.

# Non-zero coefficients at the EBIC-selected lambda (this predict() method
# returns an unnamed vector, so only the values are printed).
cosp <- predict(fit,type="coefficients",lambda = lbdl[4,1])
cosp[which(cosp!=0)]
##  [1] -0.47821245  0.08003702 -0.10860548 -0.91744884 -0.31589203
##  [6]  0.27067687  0.01188295  0.21152310 -0.15449982  1.77293117
lbdl[4,2] <- length(cosp[which(cosp!=0)])

There are 10 non-zero vbs.

Accuracy

library(mcca)
# Accuracy of the EBIC-selected model on training and test data.
pre <- predict(fit, X=trainx, type = "response",lambda = lbdl[4,1])
pre <- data.matrix(pre)
lbdl[4,3] <- ccr(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[4,4] <- hum(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)

pre <- predict(fit, X=testx, type = "response",lambda = lbdl[4,1])
pre <- data.matrix(pre)
lbdl[4,5] <- ccr(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[4,6] <- hum(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbdl[4,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[4,6])
## Area under curve of test data: 0.846189

group lasso for mi

train & test

# Keep only the train/test split and the lasso results table; discard the
# rest of the plain-lasso objects.
vb <- c("trainx","trainy","testx","testy","lbdl")
rm(list=ls()[!ls() %in% vb])

# Recode the factor outcome (levels 1/2) to numeric 0/1 for the group-lasso
# fits below.
testy=as.numeric(testy)-1
trainy=as.numeric(trainy)-1

Index vector

# Group index for the dummy columns: dummies expanded from the same original
# variable share a group number. The first entry (intercept) is dropped.
load("./index.Rdata")
index <- index[-1]
index
##   [1]   1   2   3   4   4   4   5   5   5   5   6   7   8   9  10  11  12
##  [18]  13  14  15  15  15  16  17  18  19  20  21  22  23  24  24  24  25
##  [35]  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42
##  [52]  43  43  44  44  45  45  46  46  47  47  48  48  49  49  50  50  51
##  [69]  51  52  53  54  55  55  55  55  55  56  56  56  56  56  57  57  57
##  [86]  57  57  58  58  58  58  58  59  60  61  62  63  64  65  66  67  68
## [103]  69  69  70  70  71  71  72  73  74  75  76  77  78  79  80  81  82
## [120]  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  97  97
## [137]  98  98  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112
## [154] 113 114 115 116 116 117 117 118 119 120 121 122 123 124 125 126 126
## [171] 126 127 127 127 128 128 128 128 129 130 131 132 132 132 132 133 133
## [188] 133 134 135 136 137 138 138 139 140 141 142 143 144 145 146 147 148
## [205] 149 150 151 151 151 152 153 154 155 156 157 158 159 160 161 162 163
## [222] 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [239] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 196
## [256] 197 197 198 198 199 200 201 202 203 204 205 205 205 205 206 207 208
## [273] 209 210 211 212 213 214 214 214 214 215 215 215 215 216 217 218 219
## [290] 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
## [307] 237 238 239 240 241 242 243 244 244 245 246 247 247 248 249 250 251
## [324] 251 252 253 254 254 255 256 257 258 259 260 261 262 263 264 265

Coef path

\(L=-\frac{1}{n}loglik+\lambda\sum_j ||\beta_j||_2\)

# Group lasso: dummy columns from the same original variable enter or leave
# the model together, as encoded by `index`.
library(grpreg)
fit <- grpreg(X=trainx, y=trainy, group=index, penalty="grLasso",family = "binomial")
plot(fit,label=T,log.l=T)

Best lambda through CV: deviance

# Results table for the group lasso, same layout as `lbdl`.
lbd <- data.frame(matrix(0, nrow = 4, ncol = 6))
rownames(lbd) <- c("CV_deviance","CV_misclassification_rate","BIC","EBIC")
colnames(lbd) <- c("lambda","num_non_zero","CCR_train","HUM_train","CCR_test","HUM_test")
# 10-fold CV for the group-lasso lambda.
cvfit <- cv.grpreg(X=trainx, y=trainy, group=index, penalty="grLasso",family = "binomial",nfolds=10,seed=1,trace=FALSE)
lbd[1,1]<- cvfit$lambda.min
plot(cvfit)

Coefs

The best lambda is 0.0063906.

# Non-zero coefficients at the CV(deviance)-optimal lambda.
cosp <- coef(cvfit,lambda=cvfit$lambda.min)
cosp[which(cosp!=0)]
##       (Intercept)           gender2               age           agegp22 
##     -2.132646e+00      5.474386e-01     -5.195624e-03      7.122076e-04 
##           agegp23           agegp24           agegp25          anti_ht1 
##     -2.038723e-03      7.605449e-04     -1.418967e-02     -1.407851e-01 
##        anti_chol1     drugs_others1     hypertension1 hyperlipidaemia11 
##     -9.161848e-01     -6.972990e-01     -4.264602e-01     -9.828030e-02 
##            bldglu              chol           GFR_EPI   better_eye_bvaR 
##     -3.143733e-03      4.539643e-01      1.113910e-02      1.815634e-01 
##          srepcylr    anisometropia1   emmetropia_pva1            smkyn2 
##      9.564015e-02     -1.370607e-01      3.040640e-02      5.979631e-01 
##              ang2         IVAN_CRAE    L_AMDgradable1  any_AMDgradable1 
##      2.148121e+00      1.671462e-05     -1.109394e-01      1.218256e+00 
##        pigment_R1     any_late_amd1   any_DRgradable1         R_retino1 
##     -5.204764e-02      1.748310e-02      4.425398e-03     -1.882591e-02 
##     R_retino_cat1     R_retino_cat2     R_retino_cat3     R_retino_cat5 
##      3.806642e-02     -3.420575e-01     -1.273258e-01     -1.635117e-01 
##  retino_cat_worse           R_CSME1         CSME_any1           POAG_L1 
##     -9.113397e-03     -2.257232e-01     -3.576518e-01     -9.945183e-02 
##         rpscar2_R 
##     -1.046879e-02
lbd[1,2] <- length(cosp[which(cosp!=0)])

There are 37 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the CV(deviance)-selected group-lasso model. The outcome was
# recoded to 0/1 for grpreg, so add 1 back for mcca's class labels.
pre <- predict(cvfit, X=trainx, type = "response",lambda = lbd[1,1])
pre <- data.matrix(pre)
lbd[1,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[1,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)

pre <- predict(cvfit, X=testx, type = "response",lambda = lbd[1,1])
pre <- data.matrix(pre)
lbd[1,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[1,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[1,5])
## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbd[1,6])
## Area under curve of test data: 0.8386581

Best lambda through CV: misclassification error

plot(cvfit,type="pred",vertical.line = FALSE)
cvfit_sum <- summary(cvfit)
# which.min() returns a single (first) index even when several lambdas tie
# for the minimum CV prediction error; which(pe == min(pe)) could return a
# vector and break the single-cell assignment below.
lbd[2,1] <- cvfit_sum$lambda[which.min(cvfit_sum$pe)]
abline(v=log(lbd[2,1]),lwd=3)

The best lambda is 0.008448.

Coefs

# Non-zero coefficients at the CV(misclassification)-optimal lambda.
cosp <- coef(cvfit,lambda=lbd[2,1])
lbd[2,2] <- length(cosp[which(cosp!=0)])
cosp[which(cosp!=0)]
##      (Intercept)          gender2              age         anti_ht1 
##    -1.262383e+00     4.273564e-01    -5.085371e-03    -1.732883e-01 
##       anti_chol1    drugs_others1    hypertension1             chol 
##    -9.880625e-01    -6.096063e-01    -2.466116e-01     4.058881e-01 
##          GFR_EPI  better_eye_bvaR         srepcylr   anisometropia1 
##     1.127762e-02     1.089449e-01     6.633619e-02    -3.917945e-02 
##           smkyn2             ang2 any_AMDgradable1  any_DRgradable1 
##     5.399146e-01     2.054450e+00     3.531465e-01     7.914514e-07 
## retino_cat_worse        CSME_any1        rpscar2_R 
##    -4.175693e-02    -1.699728e-01    -4.661313e-03

There are 19 non-zero vbs.

Accuracy

library(mcca)
# Accuracy of the CV(misclassification)-selected group-lasso model.
pre <- predict(cvfit, X=trainx, type = "response",lambda = lbd[2,1])
pre <- data.matrix(pre)
lbd[2,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[2,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)

pre <- predict(cvfit, X=testx, type = "response",lambda = lbd[2,1])
pre <- data.matrix(pre)
lbd[2,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[2,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[2,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[2,6])
## Area under curve of test data: 0.8322684

Best lambda through BIC

Lambda <- fit$lambda
xlim <- range(Lambda)
# BIC selection for the group-lasso path; compute once and reuse for both
# the plot and the chosen lambda (the original called select() twice, with
# inconsistently partially-matched `crit=` / `criterion=` arguments).
bic_sel <- select(fit, criterion = "BIC", df.method = "default")
plot(Lambda, bic_sel$IC, xlim=xlim, pch=19, type="o", ylab="BIC", col="red")
lbd[3,1] <- bic_sel$lambda
abline(v=lbd[3,1],lwd=3)

Coefs

The best lambda is 0.0092717.

# Non-zero coefficients at the BIC-selected lambda.
cosp <- select(fit,criterion = "BIC")$beta
cosp[which(cosp!=0)]
##      (Intercept)          gender2              age         anti_ht1 
##     -0.866920415      0.384591621     -0.005063179     -0.182959437 
##       anti_chol1    drugs_others1    hypertension1             chol 
##     -0.980664378     -0.575060465     -0.188601266      0.390773848 
##          GFR_EPI  better_eye_bvaR         srepcylr   anisometropia1 
##      0.011383520      0.076532787      0.051032856     -0.001409315 
##           smkyn2             ang2 retino_cat_worse        CSME_any1 
##      0.516494523      2.019505417     -0.034955107     -0.031755151 
##        rpscar2_R 
##     -0.001904645
lbd[3,2] <- length(cosp[which(cosp!=0)])

There are 17 non-zero vbs.

Accuracy

library(mcca)
# Accuracy of the BIC-selected group-lasso model.
pre <- predict(fit, X=trainx, type = "response",lambda = lbd[3,1])
pre <- data.matrix(pre)
lbd[3,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[3,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)

# Predict from `fit` here as well: the original predicted the test set from
# `cvfit`, inconsistently with the training block above (with an explicit
# lambda both give the same fitted path, but this makes the intent clear).
pre <- predict(fit, X=testx, type = "response",lambda = lbd[3,1])
pre <- data.matrix(pre)
lbd[3,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[3,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[3,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[3,6])
## Area under curve of test data: 0.8343222

Best lambda through EBIC

library(grpreg)
Lambda <- fit$lambda
xlim <- range(Lambda)
# EBIC = BIC + 2*gamma*df*log(P) with gamma = 0.5.
bic <- select(fit, criterion = "BIC", df.method = "active")$IC
ebic <- bic+2*0.5*log(dim(trainx)[2])*predict(fit,type="nvars")
plot(Lambda,ebic,xlim=xlim,pch=19,type="o",ylab="EBIC",col="red")
# which.min() is NA-safe and yields a single first index even if several
# lambdas tie for the minimum EBIC.
lbd[4,1] <- Lambda[which.min(ebic)]
abline(v=lbd[4,1],lwd=3)

Coefs

The best lambda is 0.0147632.

# Non-zero coefficients at the EBIC-selected lambda (unnamed vector).
cosp <- predict(fit,type="coefficients",lambda = lbd[4,1])
cosp[which(cosp!=0)]
## [1] -0.74786667  0.13265368 -0.15407222 -0.93135420 -0.37203412  0.29529415
## [7]  0.01227613  0.34899345  1.81360064
lbd[4,2] <- length(cosp[which(cosp!=0)])

There are 9 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the EBIC-selected group-lasso model.
pre <- predict(fit, X=trainx, type = "response",lambda = lbd[4,1])
pre <- data.matrix(pre)
lbd[4,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[4,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)

pre <- predict(fit, X=testx, type = "response",lambda = lbd[4,1])
pre <- data.matrix(pre)
lbd[4,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[4,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[4,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[4,6])
## Area under curve of test data: 0.8409402

sparse group lasso for mi

train & test

# Keep the split, the two results tables and the group index; drop the rest.
vb <- c("trainx","trainy","testx","testy","lbdl","lbd","index")
rm(list=ls()[!ls() %in% vb])
# Convert the 0/1 outcomes back to factors for msgl's `classes` argument.
trainy <- factor(trainy)
testy <- factor(testy)
library(msgl)
## Warning: package 'msgl' was built under R version 3.4.3
## Loading required package: sglOptim
## Warning: package 'sglOptim' was built under R version 3.4.3

Coef path

\(-loglik+\lambda[(1-\alpha)\sum_{j=1}^J r_j*||\beta_j||_2+\alpha||\beta||_1]\)

Where \(r_j=(2*\#\{\beta_j\})^\frac{1}{2}\)

# Lambda sequence of length 200 for the sparse group lasso; alpha = 0.5
# mixes the group penalty and the plain L1 penalty equally.
l <- lambda(x=trainx, classes=trainy,d=200,lambda.min=0.001, intercept = TRUE,standardize = TRUE,grouping =index)
fit <- msgl::fit(x=trainx, classes=trainy, alpha = 0.5, lambda = l,grouping = index,
                 standardize = TRUE,d = 200, intercept = TRUE)
## 
## Running msgl (dense design matrix) 
## 
##  Samples:  Features:  Classes:  Groups:  Parameters: 
##     2.949k        340         2      266          680
# Collect the first-class coefficient vector at each of the 200 lambdas into
# a 340 x 200 matrix (340 = number of features incl. intercept, per the
# msgl banner above).
be <- fit$beta
df <- matrix(0,340,200)
for (i in 1:200){
  tmp <- be[[i]]
  df[,i] <- as.matrix(tmp)[1,]
}
cl <- rainbow(340)
# Hand-drawn coefficient-path plot; row 1 (the intercept) is skipped.
plot(0,0,xlim = c(0,0.05),ylim = c(-2.5,2),type = "n",xlab="lambda",ylab="coef")
for (i in 2:340){
  lines(fit$lambda,df[i,],col = cl[i],type = 'l')
}

Best lambda through CV: deviance

# Results table for the sparse group lasso, same layout as the others.
lbds <- data.frame(matrix(0, nrow = 4, ncol = 6))
rownames(lbds) <- c("CV_deviance","CV_misclassification_rate","BIC","EBIC")
colnames(lbds) <- c("lambda","num_non_zero","CCR_train","HUM_train","CCR_test","HUM_test")
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
# 10-fold CV over the precomputed lambda sequence `l`.
fit.cv <- cv(x=trainx, classes=trainy, fold = 10, alpha = 0.5, lambda = l,
             use_parallel = TRUE,grouping =index,standardize = TRUE, intercept = TRUE)
## Warning in for (i in 1L:d2) {: 关闭不再使用的链结10(<-Gao-PC:11314)
## Warning in for (i in 1L:d2) {: 关闭不再使用的链结9(<-Gao-PC:11314)
## Warning in for (i in 1L:d2) {: 关闭不再使用的链结8(<-Gao-PC:11314)
## Running msgl 10 fold cross validation (dense design matrix)
## 
##  Samples:  Features:  Classes:  Groups:  Parameters: 
##     2.949k        340         2      266          680
# Stop the explicit cluster so the worker connections do not leak (the
# "closing unused connection" warnings above stem from earlier leaks).
stopCluster(cl)
# Lambda minimising the CV negative log-likelihood; [1] keeps the first
# index in case of ties.
id <- which(Err(fit.cv, type = "loglike")==min(Err(fit.cv, type = "loglike")))[1]
lbds[1,1]<- l[id]

Lambda <- l
xlim <- range(Lambda)
plot(Lambda,Err(fit.cv, type = "loglike"),xlim=xlim,pch=19,type="o",ylab="CV_dev",col="red")
abline(v=lbds[1,1],lwd=3)

Coefs

The best lambda is 0.0034035.

as.matrix(coef(fit, id))[1,]
##          Intercept            gender2            agegp22 
##       1.545380e+00      -4.085206e-01      -2.382046e-02 
##            agegp23            agegp24            agegp25 
##       4.671626e-02      -1.986130e-02       3.255235e-01 
##            bppul_f         anti_chol1      drugs_others1 
##      -3.421861e-03       1.448584e-01       3.964860e-01 
##     drugs_unknown1      hypertension1     hypertension32 
##       3.529373e-03       4.571243e-01       1.316352e-02 
##  hyperlipidaemia11        blood_data1             bldglu 
##       4.123419e-01      -3.513039e-01       1.066707e-02 
##               chol            GFR_EPI           CKD_EPI1 
##      -3.004650e-01      -7.056104e-03      -4.519768e-02 
##    better_eye_bvaR       pvalogr_USA2       bvalogr_USA2 
##      -1.066048e-01       1.702083e-02       5.505719e-02 
##       bvalogl_WHO2       bvalogl_WHO3       undercorr_L1 
##       1.500131e-01      -7.748809e-02       3.826679e-02 
##           srepsphl           srepcylr           srepcyll 
##      -1.483967e-03      -8.270729e-02       8.541625e-02 
##     anisometropia1    highhyperopiar1       emmetropiar1 
##       1.323592e-01      -1.097117e-02      -1.106608e-02 
##    highmyopia_pva1 highhyperopia_pva1    emmetropia_pva1 
##       1.355630e-01      -2.649955e-02      -9.790805e-02 
##              opp_R          lenstatr2          lenstatr4 
##      -4.782955e-03       2.884901e-06       1.743796e-04 
##      cocataract_L1              race2          job_cat32 
##       1.254231e-02       1.039095e-02       4.983708e-02 
##              home3              home4             smkyn2 
##      -4.400031e-04       1.006029e-03      -3.216387e-01 
##           smk_cat3             catyn2               ang2 
##       3.030527e-02       1.083345e-02      -1.149143e+00 
##            htchol2              avccr          IVAN_CRAE 
##      -3.524412e-02      -1.520509e-01      -1.781839e-03 
##           IVAN_AVR     L_AMDgradable1   any_AMDgradable1 
##      -3.532818e-01       3.848282e-01      -1.228562e+00 
##         pigment_R1       R_early_amd1      geoatrophy_L1 
##       1.101177e-01       5.925148e-02       4.938554e-01 
##    L_late_amd_dry1      any_late_amd1    any_DRgradable1 
##      -3.511918e-01      -5.395305e-01      -2.981747e-03 
##      R_retino_cat1      R_retino_cat2      R_retino_cat3 
##      -1.667822e-02       4.783728e-01       6.948303e-02 
##      R_retino_cat5            R_CSME1          CSME_any1 
##       9.901070e-02       3.461595e-01       2.474139e-01 
##        sec_glau_R1         pri_ACD_R1            POAG_L1 
##       1.789268e-01      -6.339169e-02       2.455129e-01 
##          PACG_any1   CataractGrading1         nuclear_L1 
##      -4.638681e-04       4.203605e-01       4.042717e-02 
##         rcrtgp2_R2         rcrtgp2_R3          rpscar2_R 
##       3.160647e-03      -9.370072e-02       5.863833e-03 
##         rpscgp2_R2           any_psc1 
##       9.028384e-02       2.116627e-02
lbds[1,2] <- length(as.matrix(coef(fit, id))[1,])

There are 74 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the CV(deviance)-selected sparse-group-lasso model; predict()
# returns one class-probability matrix per lambda, and t() puts samples in
# rows for mcca.
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[1,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[1,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)

res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[1,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[1,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[1,5])
## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbds[1,6])
## Area under curve of test data: 0.8404838

Best lambda through CV: ME

# Lambda minimising the cross-validated misclassification rate: compute the
# error curve once (the original evaluated Err() twice) and take the first
# minimiser with which.min().
err_rate <- Err(fit.cv, type = "rate")
id <- which.min(err_rate)
lbds[2,1]<- l[id]
# ylab corrected: this plots the misclassification rate, not the deviance.
plot(Lambda, err_rate, xlim=xlim, pch=19, type="o", ylab="CV_misclassification_rate", col="red")
abline(v=lbds[2,1],lwd=3)

Coefs

The best lambda is 0.005706.

as.matrix(coef(fit, id))[1,]
##         Intercept           gender2           agegp22           agegp23 
##     -0.7492293199     -0.3089418187     -0.0125955592      0.0196953985 
##           agegp25           bppul_f          anti_ht1        anti_chol1 
##      0.1693088481     -0.0003835190      0.0584737902      0.4540023401 
##     drugs_others1     hypertension1 hyperlipidaemia11       blood_data1 
##      0.3573195358      0.2596668727      0.0749868898     -0.0614854941 
##            bldglu              chol           GFR_EPI   better_eye_bvaR 
##      0.0030411046     -0.2351715220     -0.0056398891     -0.0834264798 
##      bvalogr_USA2          srepcylr    anisometropia1   emmetropia_pva1 
##      0.0422656882     -0.0489389904      0.0834752344     -0.0386819443 
##            smkyn2          smk_cat3              ang2         IVAN_CRAE 
##     -0.2836284095      0.0350365447     -1.0838605311     -0.0003267495 
##    L_AMDgradable1  any_AMDgradable1        pigment_R1     any_late_amd1 
##      0.1225242418     -0.7324240117      0.0470319630     -0.0854735414 
##   any_DRgradable1     R_retino_cat2     R_retino_cat3     R_retino_cat5 
##     -0.0030994879      0.3315163995      0.0595516881      0.0847097416 
##           R_CSME1         CSME_any1           POAG_L1         rpscar2_R 
##      0.1594425583      0.2222287297      0.1100003720      0.0053894390 
##        rpscgp2_R2          any_psc1 
##      0.0023600633      0.0220580679
lbds[2,2] <- length(as.matrix(coef(fit, id))[1,])

There are 38 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the CV(misclassification)-selected sparse-group-lasso model.
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[2,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[2,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)

res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[2,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[2,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[2,5])
## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbds[2,6])
## Area under curve of test data: 0.8457325

Best lambda through BIC

# Training log-likelihood and active-set size at each lambda on the path,
# computed in a single pass over the path (the original ran two identical
# 1:200 loops, with the path length hard-coded).
n_lam <- length(fit$lambda)
lik <- numeric(n_lam)
df <- numeric(n_lam)
res <- predict(fit, trainx)
for (i in seq_len(n_lam)) {
  # columns 1/2 of the transposed response are P(class 0) / P(class 1)
  d <- t(res$response[[i]])
  lik[i] <- sum(log(d[trainy==0,1]))+ sum(log(d[trainy==1,2]))
  df[i] <- length(as.matrix(coef(fit, i))[1,])
}

# BIC = -2*loglik + df*log(n)
bic <- -2*lik + df*log(dim(trainx)[1])
Lambda <- fit$lambda
xlim <- range(Lambda)
plot(Lambda,bic,xlim=xlim,pch=19,type="o",ylab="BIC",col="red")
# which.min() gives a single first index; which(bic == min(bic)) can return
# several on ties and would break the single-cell assignment below.
id <- which.min(bic)
lbds[3,1] <- Lambda[id]
abline(v=lbds[3,1],lwd=3)

Coefs

The best lambda is 0.0125053.

as.matrix(coef(fit, id))[1,]
##     Intercept       gender2           age      anti_ht1    anti_chol1 
##  -1.498544960  -0.118253876   0.001600595   0.103841250   0.477021800 
## drugs_others1          chol       GFR_EPI        smkyn2          ang2 
##   0.225972115  -0.166133345  -0.005971500  -0.207175487  -0.946986846
lbds[3,2] <- length(as.matrix(coef(fit, id))[1,])

There are 10 non-zero vbs.

Accuracy
library(mcca)
# NOTE(review): the commented-out line below means this chunk relies on
# `res` still holding the training-set predictions computed in the BIC
# chunk above — fragile if chunks are run out of order.
#res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[3,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[3,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)

res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[3,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[3,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[3,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbds[3,6])
## Area under curve of test data: 0.8400274

Best lambda through EBIC

# EBIC with gamma = 1 (a heavier penalty than the gamma = 0.5 used for the
# other models): EBIC = BIC + 2*gamma*df*log(P).
ebic <- bic+df*2*1*log(dim(trainx)[2])
plot(Lambda,ebic,xlim=xlim,pch=19,type="o",ylab="EBIC",col="red")
# which.min() yields a single first index even if several lambdas tie.
id <- which.min(ebic)
lbds[4,1] <- Lambda[id]
abline(v=lbds[4,1],lwd=3)

Coefs

The best lambda is 0.0148558.

as.matrix(coef(fit, id))[1,]
##     Intercept       gender2      anti_ht1    anti_chol1 drugs_others1 
##  -1.473369292  -0.067060822   0.077682024   0.466963562   0.184767312 
##          chol       GFR_EPI        smkyn2          ang2 
##  -0.146164806  -0.006094271  -0.171624229  -0.904404551
lbds[4,2] <- length(as.matrix(coef(fit, id))[1,])

There are 9 non-zero vbs.

Accuracy
library(mcca)
# Accuracy of the EBIC-selected sparse-group-lasso model.
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[4,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[4,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)

res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[4,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[4,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[4,5])
## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbds[4,6])
## Area under curve of test data: 0.8411684

Comparison

# Stack the three results tables (lasso, group lasso, sparse group lasso)
# for side-by-side comparison; duplicated row names are made unique
# automatically by rbind().
df <- rbind(lbdl,lbd,lbds)
DT::datatable(df)
save(df, file="./df.Rdata")